The purpose of this document is to perform EDA on the clean_assessments.csv data to identify patterns in the data that will inform a model of sale price.
library(tidyverse)
library(lubridate)
library(sf)
library(leaflet)
library(skimr)
library(hrbrthemes)
library(janitor)
library(scales)
library(here)
#here::i_am("scripts/eda")
options(scipen = 999, digits = 4)
theme_set(theme_ipsum())
assessment_data_path <- here("data/cleaned/big", "clean_assessment_data_geocoded.csv")
unified_geo_ids_path <- here("data/cleaned/big/unified_geo_ids/unified_geo_ids.shp")
assessments_valid <- read_csv(assessment_data_path) %>%
mutate(sale_month = factor(sale_month, levels = month.abb))
geo_ids <- st_read(unified_geo_ids_path)
## Reading layer `unified_geo_ids' from data source `/Users/conortompkins/github_repos/allegheny_house_price_app/data/cleaned/big/unified_geo_ids/unified_geo_ids.shp' using driver `ESRI Shapefile'
## Simple feature collection with 80 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -80.36 ymin: 40.19 xmax: -79.69 ymax: 40.67
## Geodetic CRS: WGS 84
glimpse(assessments_valid)
## Rows: 160,186
## Columns: 29
## $ par_id <chr> "0001G00111000000", "0001G00224110600", "0001G002…
## $ usedesc <chr> "SINGLE FAMILY", "CONDOMINIUM", "CONDOMINIUM", "C…
## $ muni_desc <chr> "1st Ward - Pittsburgh", "1st Ward - Pittsburgh…
## $ sale_desc <chr> "Valid Sale", "Valid Sale", "Valid Sale", "Other …
## $ sale_price <dbl> 2000000, 333325, 400000, 387000, 425000, 245000, …
## $ year_built <dbl> 2015, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2…
## $ style_desc <chr> "Townhouse", "Condo", "Condo", "Condo", "Condo", …
## $ bedrooms <dbl> 3, 2, 2, 2, 2, 1, 2, 1, 3, 2, 2, 1, 2, 1, 3, 2, 2…
## $ full_baths <dbl> 3, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1…
## $ half_baths <dbl> 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1…
## $ finished_living_area <dbl> 2011, 1434, 1483, 1388, 1275, 889, 1666, 1107, 17…
## $ lot_area <dbl> 995, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ grade_desc <chr> "Excellent", "Very Good", "Very Good", "Very Good…
## $ condition_desc <chr> "Excellent", "Average", "Average", "Average", "Av…
## $ extfinish_desc <chr> "Brick", "Concrete", "Concrete", "Concrete", "Con…
## $ roof_desc <chr> "Roll", "Roll", "Roll", "Roll", "Roll", "Roll", "…
## $ basement_desc <chr> "Full", "None", "None", "None", "None", "None", "…
## $ cdu_desc <chr> "Excellent", "Average", "Average", "Average", "Av…
## $ heating_cooling_desc <chr> "Central Heat With AC", "Central Heat With AC", "…
## $ fireplaces <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ basement_garage <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ sale_year <dbl> 2018, 2007, 2013, 2009, 2019, 2013, 2014, 2008, 2…
## $ sale_month <fct> Oct, Dec, Sep, Oct, Jun, Sep, Jun, Aug, Feb, May,…
## $ school_desc <chr> "1st Ward Pittsburgh", "1st Ward Pittsburgh", "…
## $ house_age_at_sale <dbl> 3, 0, 6, 2, 12, 6, 7, 1, 6, 9, 6, 10, 3, 9, 0, 3,…
## $ ac_flag <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, T…
## $ heat_type <chr> "Central Heat", "Central Heat", "Central Heat", "…
## $ sale_price_adj <dbl> 2036244, 410997, 438978, 461176, 425000, 268874, …
## $ geo_id <chr> "City Council District 6", "City Council District…
skim(assessments_valid)
| Name | assessments_valid |
| Number of rows | 160186 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 15 |
| factor | 1 |
| logical | 1 |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| par_id | 0 | 1 | 16 | 16 | 0 | 160181 | 0 |
| usedesc | 0 | 1 | 8 | 22 | 0 | 13 | 0 |
| muni_desc | 0 | 1 | 4 | 23 | 0 | 174 | 0 |
| sale_desc | 0 | 1 | 10 | 11 | 0 | 2 | 0 |
| style_desc | 0 | 1 | 3 | 13 | 0 | 20 | 0 |
| grade_desc | 0 | 1 | 4 | 13 | 0 | 7 | 0 |
| condition_desc | 22 | 1 | 4 | 9 | 0 | 8 | 0 |
| extfinish_desc | 1 | 1 | 3 | 14 | 0 | 8 | 0 |
| roof_desc | 212 | 1 | 4 | 7 | 0 | 6 | 0 |
| basement_desc | 30 | 1 | 4 | 10 | 0 | 5 | 0 |
| cdu_desc | 24 | 1 | 4 | 9 | 0 | 8 | 0 |
| heating_cooling_desc | 24 | 1 | 4 | 21 | 0 | 15 | 0 |
| school_desc | 0 | 1 | 6 | 20 | 0 | 78 | 0 |
| heat_type | 24 | 1 | 4 | 13 | 0 | 9 | 0 |
| geo_id | 0 | 1 | 7 | 23 | 0 | 61 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sale_month | 0 | 1 | FALSE | 12 | Jun: 17138, Jul: 17079, Aug: 16741, May: 15342 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| ac_flag | 24 | 1 | 0.64 | TRU: 102989, FAL: 57173 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| sale_price | 0 | 1.00 | 128739.87 | 125072.84 | 325.0 | 55000 | 92000 | 160000 | 3070000 | ▇▁▁▁▁ |
| year_built | 0 | 1.00 | 1950.69 | 29.70 | 1797.0 | 1930 | 1953 | 1970 | 2019 | ▁▁▅▇▃ |
| bedrooms | 7 | 1.00 | 3.06 | 0.88 | 0.0 | 3 | 3 | 4 | 14 | ▂▇▁▁▁ |
| full_baths | 21 | 1.00 | 1.50 | 0.66 | 0.0 | 1 | 1 | 2 | 12 | ▇▁▁▁▁ |
| half_baths | 1000 | 0.99 | 0.54 | 0.57 | 0.0 | 0 | 0 | 1 | 9 | ▇▁▁▁▁ |
| finished_living_area | 0 | 1.00 | 1700.82 | 758.72 | 0.0 | 1188 | 1512 | 2028 | 12790 | ▇▁▁▁▁ |
| lot_area | 0 | 1.00 | 13181.22 | 37628.87 | 0.0 | 3953 | 7500 | 12930 | 4859466 | ▇▁▁▁▁ |
| fireplaces | 9064 | 0.94 | 0.42 | 0.56 | 0.0 | 0 | 0 | 1 | 7 | ▇▁▁▁▁ |
| basement_garage | 4020 | 0.97 | 0.79 | 0.84 | 0.0 | 0 | 1 | 1 | 6 | ▇▂▁▁▁ |
| sale_year | 0 | 1.00 | 2001.80 | 11.83 | 1976.0 | 1993 | 2003 | 2012 | 2020 | ▃▅▆▇▇ |
| house_age_at_sale | 0 | 1.00 | 51.12 | 30.51 | 0.0 | 28 | 50 | 71 | 219 | ▇▇▂▁▁ |
| sale_price_adj | 0 | 1.00 | 176801.24 | 146867.78 | 412.2 | 90107 | 140612 | 215593 | 3465024 | ▇▁▁▁▁ |
Inflation-adjusted sales price (sale_price_adj) is normally distributed on the log10() scale, as expected.
assessments_valid %>%
ggplot(aes(sale_price_adj)) +
geom_density() +
scale_x_log10(labels = dollar)
Adjusting for inflation (sale_price_adj is 2019 dollars) removes a lot of the drift over time.
assessments_valid %>%
select(sale_year, sale_price, sale_price_adj) %>%
pivot_longer(cols = contains("sale_price")) %>%
ggplot(aes(sale_year, value, color = name)) +
geom_smooth() +
scale_y_continuous(labels = dollar)
assessments_valid %>%
mutate(sale_month = fct_rev(sale_month)) %>%
add_count(sale_month) %>%
ggplot(aes(sale_month, sale_price_adj, group = sale_month, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_y_log10() +
coord_cartesian(ylim = c(10^4, 10^6)) +
scale_fill_viridis_c()
The location of the house (geo_id) has a big impact on sale_price_adj.
geo_id_median_price <- assessments_valid %>%
group_by(geo_id) %>%
summarize(median_price = median(sale_price_adj))
pal <- colorNumeric(
palette = "viridis",
domain = geo_id_median_price$median_price)
geo_ids %>%
left_join(geo_id_median_price) %>%
leaflet() %>%
addProviderTiles(providers$Stamen.TonerLite,
options = providerTileOptions(noWrap = TRUE,
minZoom = 9,
#maxZoom = 8
)) %>%
addPolygons(popup = ~ str_c(geo_id, " ", "median price: ", dollar(median_price), sep = ""),
fillColor = ~pal(median_price),
fillOpacity = .7,
color = "black",
weight = 3) %>%
addLegend("bottomright", pal = pal, values = ~median_price,
title = "Median sales price",
labFormat = labelFormat(prefix = "$"),
opacity = 1)
assessments_valid %>%
add_count(geo_id) %>%
mutate(geo_id = fct_reorder(geo_id, sale_price_adj, .fun = median)) %>%
ggplot(aes(sale_price_adj, geo_id, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
The age of the house at time of sale has a negative relationship with sale price, but is heteroskedastic. This indicates that some houses retain their value better over time (and/or that houses that don’t retain value aren’t bought and sold).
assessments_valid %>%
ggplot(aes(house_age_at_sale, sale_price_adj)) +
geom_density_2d_filled() +
scale_y_log10(labels = dollar)
There is not an obvious relationship between lot_area and sale_price_adj.
assessments_valid %>%
ggplot(aes(lot_area, sale_price_adj)) +
geom_density_2d_filled() +
scale_y_log10(labels = dollar) +
coord_cartesian(xlim = c(0, 50000))
But, lot_area varies drastically across geo_id, so there might still be a useful feature to be engineered.
assessments_valid %>%
mutate(geo_id = fct_reorder(geo_id, lot_area, .fun = median)) %>%
ggplot(aes(lot_area, geo_id)) +
geom_boxplot(outlier.alpha = 0) +
coord_cartesian(xlim = c(0, 40000))
lot_area also varies across style_desc.
assessments_valid %>%
mutate(style_desc = fct_reorder(style_desc, lot_area, .fun = median)) %>%
ggplot(aes(lot_area, style_desc)) +
geom_boxplot(outlier.alpha = 0) +
coord_cartesian(xlim = c(0, 10^5))
finished_living_area will be a useful variable.
assessments_valid %>%
ggplot(aes(finished_living_area, sale_price_adj)) +
geom_density_2d_filled() +
scale_y_log10(labels = dollar) +
coord_cartesian(xlim = c(0, 5000))
finished_living_area also varies across style_desc.
assessments_valid %>%
mutate(style_desc = fct_reorder(style_desc, finished_living_area, .fun = median)) %>%
ggplot(aes(finished_living_area, style_desc)) +
geom_boxplot(outlier.alpha = 0) +
coord_cartesian(xlim = c(0, 10^4))
finished_living_area also varies across geo_id, but not as drastically.
assessments_valid %>%
mutate(geo_id = fct_reorder(geo_id, finished_living_area, .fun = median)) %>%
ggplot(aes(finished_living_area, geo_id)) +
geom_boxplot(outlier.alpha = 0) +
coord_cartesian(xlim = c(0, 10000))
grade_desc has a clear relationship with sale_price_adj.
assessments_valid %>%
mutate(grade_desc = fct_reorder(grade_desc, sale_price_adj, median)) %>%
add_count(grade_desc) %>%
ggplot(aes(sale_price_adj, grade_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(labels = dollar) +
scale_fill_viridis_c()
assessments_valid %>%
mutate(cdu_desc = fct_reorder(cdu_desc, sale_price_adj, .fun = median)) %>%
add_count(cdu_desc) %>%
ggplot(aes(sale_price_adj, cdu_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
condition_desc also has a relationship, but some of the levels can probably be collapsed.
assessments_valid %>%
mutate(condition_desc = fct_reorder(condition_desc, sale_price_adj, median)) %>%
add_count(condition_desc) %>%
ggplot(aes(sale_price_adj, condition_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(labels = dollar) +
scale_fill_viridis_c()
There are 4 main types of houses, with a lot of low-n types that can be collapsed.
assessments_valid %>%
count(style_desc) %>%
mutate(style_desc = fct_reorder(style_desc, n, .fun = max)) %>%
ggplot(aes(n, style_desc)) +
geom_col()
There is some time-series pattern in when different types of houses were created.
assessments_valid %>%
add_count(style_desc) %>%
filter(n > 5000) %>%
mutate(style_desc = fct_reorder(style_desc, n, .fun = "max", .desc = T)) %>%
ggplot(aes(year_built, fill = style_desc)) +
geom_histogram(binwdidth = 30) +
guides(fill = FALSE) +
facet_wrap(~style_desc, scales = "free_y")
Most homes have between 1 and 2 full and half bathrooms.
assessments_valid %>%
count(full_baths, half_baths) %>%
complete(full_baths = 0:12, half_baths = 0:9, fill = list(n = 0)) %>%
ggplot(aes(full_baths, half_baths, fill = n)) +
geom_tile() +
scale_fill_viridis_c() +
scale_x_continuous(breaks = c(0:12),
expand = c(0,0)) +
scale_y_continuous(breaks = c(0:12),
expand = c(0,0)) +
coord_equal() +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
fullbaths and sale_price_adj are positively related.
assessments_valid %>%
add_count(full_baths) %>%
ggplot(aes(sale_price_adj, full_baths, fill = n, group = full_baths)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_y_continuous(breaks = c(0:12)) +
scale_fill_viridis_c()
There appear to be diminishing returns on the number of half bathrooms.
assessments_valid %>%
add_count(half_baths) %>%
ggplot(aes(sale_price_adj, half_baths, fill = n, group = half_baths)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_y_continuous(breaks = c(0:9)) +
scale_fill_viridis_c()
Need to split these out to heat_type and ac_flag.
assessments_valid %>%
mutate(heating_cooling_desc = fct_reorder(heating_cooling_desc, sale_price_adj, .fun = median)) %>%
add_count(heating_cooling_desc) %>%
ggplot(aes(sale_price_adj, heating_cooling_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Impute missing based on mode for geo_id and style_desc.
assessments_valid %>%
mutate(heat_type = fct_reorder(heat_type, sale_price_adj, .fun = median)) %>%
add_count(heat_type) %>%
ggplot(aes(sale_price_adj, heat_type, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Impute missing based on mode for geo_id and style_desc.
assessments_valid %>%
mutate(ac_flag = as.character(ac_flag)) %>%
mutate(ac_flag = fct_reorder(ac_flag, sale_price_adj, .fun = median)) %>%
add_count(ac_flag) %>%
ggplot(aes(sale_price_adj, ac_flag, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Impute missing based on mode for geo_id and style_desc.
assessments_valid %>%
mutate(extfinish_desc = fct_reorder(extfinish_desc, sale_price_adj, .fun = median)) %>%
add_count(extfinish_desc) %>%
ggplot(aes(sale_price_adj, extfinish_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Impute missing based on mode for geo_id and style_desc.
assessments_valid %>%
mutate(roof_desc = fct_reorder(roof_desc, sale_price_adj, .fun = median)) %>%
add_count(roof_desc) %>%
ggplot(aes(sale_price_adj, roof_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Impute missing based on mode for geo_id and style_desc.
assessments_valid %>%
mutate(basement_desc = fct_reorder(basement_desc, sale_price_adj, .fun = median)) %>%
add_count(basement_desc) %>%
ggplot(aes(sale_price_adj, basement_desc, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Impute missing based on mode for geo_id and style_desc. Need to look up what this column is. Number of cars that can fit in the basement garage?
assessments_valid %>%
select(sale_price_adj, basement_garage) %>%
mutate(basement_garage = as.character(basement_garage),
basement_garage = fct_explicit_na(basement_garage),
basement_garage = fct_reorder(basement_garage, sale_price_adj, .fun = median)) %>%
add_count(basement_garage) %>%
ggplot(aes(sale_price_adj, basement_garage, group = basement_garage, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
Positive relationship between number of fireplaces and sale price, but most houses have 1 or 2. Consider changing to lgl fireplace_flag column where it checks fireplaces >= 1.
assessments_valid %>%
add_count(fireplaces) %>%
ggplot(aes(sale_price_adj, fireplaces, group = fireplaces, fill = n)) +
geom_boxplot(outlier.alpha = 0,
color = "grey") +
scale_x_log10(label = dollar) +
scale_fill_viridis_c()
assessments_valid %>%
ggplot(aes(lot_area)) +
geom_density() +
geom_vline(xintercept = 40000) +
scale_x_log10()
assessments_valid %>%
ggplot(aes(finished_living_area)) +
geom_density() +
scale_x_log10() +
geom_vline(xintercept = 10000)